
library(tidyverse)
library(cobalt)
library(WeightIt)
library(MatchIt)


df_main <- read.csv("study_two_data.csv", fileEncoding="UTF-8-BOM")


df <- df_main %>%
  filter(GENDER == "M" | GENDER == "F") %>%
  mutate(First_Gen = ifelse(PARENT1_EDUCATION_LEVEL == "Four-Year College/University Graduate" |
                              PARENT1_EDUCATION_LEVEL == "Postgraduate Study" |
                              PARENT2_EDUCATION_LEVEL == "Four-Year College/University Graduate" |
                              PARENT2_EDUCATION_LEVEL == "Postgraduate Study", 0,1),
         perc_words = word_count_ner / essay_words, 
         perc_char = char_count_ner / essay_chars,
         Mexico = ifelse(IS_UC_MEXICAN == "Y",1,0),
         Cuba = ifelse(IS_UC_CUBAN == "Y",1,0),
         Puerto_Rico = ifelse(IS_UC_PUERTO_RICAN == "Y",1,0),
         Other = ifelse(IS_UC_OTHER_LATINO == "Y",1,0),
         Gender = GENDER)

df <- df %>%
  mutate(HSI = ifelse(HSI == "%", 0, 1)) %>%
  filter(essay_chars > 100)


# Table: Basic stats of how much namedropping in essays for the years, length
# of essays

mean(df[df$Treatment == 1,]$perc_char)
mean(df[df$Treatment == 0,]$perc_char)

mean(df[df$Treatment == 1,]$perc_words)
mean(df[df$Treatment == 0,]$perc_words)

mean(df[df$Treatment == 1,]$word_count_ner)
mean(df[df$Treatment == 0,]$word_count_ner)

mean(df[df$Treatment == 1,]$char_count_ner)
mean(df[df$Treatment == 0,]$char_count_ner)

df %>%
  filter(perc_words == 0 & Treatment == 1) %>%
  nrow() / nrow(df[df$Treatment == 1, ])

df %>%
  filter(perc_words == 0 & Treatment == 0) %>%
  nrow() / nrow(df[df$Treatment == 0, ])

################################################################################

# Same as above but for lateral/vertical applicants

lat <- df %>%
  filter(Type == "Lat")

vert <- df %>%
  filter(Type == "Vert")

mean(lat[lat$Treatment == 1,]$perc_char)
mean(lat[lat$Treatment == 0,]$perc_char)

mean(lat[lat$Treatment == 1,]$perc_words)
mean(lat[lat$Treatment == 0,]$perc_words)

mean(lat[lat$Treatment == 1,]$word_count_ner)
mean(lat[lat$Treatment == 0,]$word_count_ner)

mean(lat[lat$Treatment == 1,]$char_count_ner)
mean(lat[lat$Treatment == 0,]$char_count_ner)

lat %>%
  filter(perc_words == 0 & Treatment == 1) %>%
  nrow() / nrow(lat[lat$Treatment == 1, ])

lat %>%
  filter(perc_words == 0 & Treatment == 0) %>%
  nrow() / nrow(lat[lat$Treatment == 0, ])


mean(vert[vert$Treatment == 1,]$perc_char)
mean(vert[vert$Treatment == 0,]$perc_char)

mean(df[df$Treatment == 1,]$perc_words)
mean(df[df$Treatment == 0,]$perc_words)

mean(df[df$Treatment == 1,]$word_count_ner)
mean(df[df$Treatment == 0,]$word_count_ner)

mean(df[df$Treatment == 1,]$char_count_ner)
mean(df[df$Treatment == 0,]$char_count_ner)

df %>%
  filter(perc_words == 0 & Treatment == 1) %>%
  nrow() / nrow(df[df$Treatment == 1, ])

df %>%
  filter(perc_words == 0 & Treatment == 0) %>%
  nrow() / nrow(df[df$Treatment == 0, ])


# Histograms for char and words
pdf("perc_plots_all_essays_logged.pdf")
perc_char_plot <- df %>%
  filter(perc_char > 0) %>%
  mutate(Treatment = as.factor(if_else(Treatment == 0,"Control","Treatment"))) %>%
  ggplot(., aes(x = log(perc_char), fill = Treatment)) +
  geom_density(alpha = .5) +
  xlab("Logged Percentage of Characters (All Essays)") + 
  ylab("Density") +
  scale_fill_manual(values = c("red2", "turquoise2")) +
  scale_x_continuous( breaks = c(-4.61,-3.23,-1.89),
                      labels = c("1%",
                                 "4%",
                                 "15%<")) +
  guides(fill = guide_legend(title = "Group")) +
  theme_light()

perc_word_plot <- df %>%
  filter(perc_words > 0) %>%
  mutate(Treatment = as.factor(if_else(Treatment == 0,"Control","Treatment"))) %>%
  ggplot(., aes(x = log(perc_words), fill = Treatment)) +
  geom_density(alpha = .5) +
  xlab("Logged Percentage of Words (All Essays)") + 
  ylab("Density") +
  scale_fill_manual(values = c("red2", "turquoise2")) +
  scale_x_continuous(
    breaks = c(-4.46, -3.36,-2.02),
    labels = c("1%",
               "3%",
               "13%<")) +
  guides(fill = guide_legend(title = "Group")) +
  theme_light()

ggpubr::ggarrange(perc_char_plot, perc_word_plot,
                  ncol = 1,
                  nrow = 2,
                  common.legend = TRUE,
                  legend = "right")
#gridExtra::grid.arrange(perc_char_plot, perc_word_plot, common.legend = TRUE)
dev.off()

# All Essays

colnames(df)
set.cobalt.options(binary = "std")

#df_tmp <- df %>%
#  filter(perc_words > 0)

w_out <- weightit(Treatment ~ Mexico + Puerto_Rico + Cuba +Other+
                    Gender + First_Gen + Type + HSI +  Research
                  + Private ,
                  estimand = "ATE", method = "ps", data = df)

love_plot_names <- c("prop.score" = "Propensity Score",
                     "Puerto_Rico" = "Puerto Rico (Y)",
                     "Cuba" = "Cuba (Y)",
                     "Mexico" = "Mexico (Y)",
                     "Other" = "Latinx, Other (Y)",
                     "Gender_M" = "Gender (M)",
                     "First_Gen" = "First Gen. Status (Y)",
                     "Type_Vert" = "Transfer Pathway (Vertical)",
                     "HSI" = "HSI (Y)",
                     "Private" = "Private Institution (Y)",
                     "Research" = "Research Institution (Y)"
)


pdf("love_plot_all_transfer_applicants.pdf", height = 6)
love.plot(w_out, title = "Covariate Balance: All Essays",
          var.names = love_plot_names,
          shapes = c(16, 17),
          colors = c("turquoise3", "red2"))
dev.off()

bal.tab(w_out)


summary(glm(perc_words ~ Treatment, data = df, 
            weights = w_out$weights))

confint(glm(perc_words ~ Treatment, data = df, 
            weights = w_out$weights))

summary(glm(perc_char ~ Treatment, data = df, 
            weights = w_out$weights))

confint(glm(perc_char ~ Treatment, data=  df, 
            weights = w_out$weights))

################################################################################

# Study One: Weighting (propensity score matching, ATE)

# Lateral essays

colnames(df)
set.cobalt.options(binary = "std")

w_lat <- weightit(Treatment ~  Mexico + Puerto_Rico + Cuba +Other+
                    Gender + First_Gen + HSI +  Research
                  + Private ,
                  estimand = "ATE", method = "ps", data = lat)


love_plot_names_lat <- c("prop.score" = "Propensity Score",
                         "Puerto_Rico" = "Puerto Rico (Y)",
                         "Cuba" = "Cuba (Y)",
                         "Mexico" = "Mexico (Y)",
                         "Other" = "Latinx, Other (Y)",
                         "Gender_M" = "Gender (M)",
                         "First_Gen" = "First Gen. Status (Y)",
                         "HSI" = "HSI (Y)",
                         "Private" = "Private Institution (Y)",
                         "Research" = "Research Institution (Y)"
)

pdf("love_plot_lat_transfer_applicants.pdf", height = 6)
love.plot(w_lat, title = "Covariate Balance: Lateral Essays",
          var.names = love_plot_names_lat,
          shapes = c(16, 17),
          colors = c("turquoise3", "red2"))
dev.off()

bal.tab(w_lat)

summary(glm(perc_words ~ Treatment, data = lat, 
            weights = w_lat$weights))

confint(glm(perc_words ~ Treatment, data = lat, 
            weights = w_lat$weights))

summary(glm(perc_char ~ Treatment, data = lat, 
            weights = w_lat$weights))

confint(glm(perc_char ~ Treatment, data = lat, 
            weights = w_lat$weights))


################################################################################


# Study One: Weighting (propensity score matching, ATE)

# Vertical essays

colnames(df)
set.cobalt.options(binary = "std")

w_vert <- weightit(Treatment ~  Mexico + Puerto_Rico + Cuba +Other+
                     Gender + First_Gen + HSI +  Research
                   + Private ,
                   estimand = "ATE", method = "ps", data = vert)

love_plot_names_vert <- c("prop.score" = "Propensity Score",
                          "Puerto_Rico" = "Puerto Rico (Y)",
                          "Cuba" = "Cuba (Y)",
                          "Mexico" = "Mexico (Y)",
                          "Other" = "Latinx, Other (Y)",
                          "Gender_M" = "Gender (M)",
                          "First_Gen" = "First Gen. Status (Y)",
                          "HSI" = "HSI (Y)",
                          "Private" = "Private Institution (Y)",
                          "Research" = "Research Institution (Y)"
)

pdf("love_plot_vert_transfer_applicants.pdf", height = 6)
love.plot(w_vert, title = "Covariate Balance: Vertical Essays",
          var.names = love_plot_names_vert,
          shapes = c(16, 17),
          colors = c("turquoise3", "red2"))
dev.off()


bal.tab(w_vert)

summary(glm(perc_words ~ Treatment, data = vert, 
            weights = w_vert$weights))

confint(glm(perc_words ~ Treatment, data = vert, 
            weights = w_vert$weights))

summary(glm(perc_char ~ Treatment, data = vert, 
            weights = w_vert$weights))

confint(glm(perc_char ~ Treatment, data = vert, 
            weights = w_vert$weights))
